Applying our zero-inflated model to the cortical data (all layets).
We select only the cells that pass QC, color coded by the layer of origin. I will focus on the top 1,000 most variable genes. Note that the data are not public, hence it will work only on Davide’s system (for now).
data("cortical")
glia0 <-c("DSJN001_N710_S505", "DSJN001_N702_S510", "DSJN003_N724_S521", "DSJN003_N721_S522", "DSJN003_N726_S522", "DSJN003_N728_S518", "DSJN003_N727_S517", "DSJN003_N728_S520", "DSJN003_N728_S515", "DSJN003_N728_S513", "DS07_N703_S506", "DS07_N703_S502", "DS07_N702_S506", "DS07_N701_S505", "DS07_N703_S505", "DS07_N701_S508", "DS07_N702_S505", "DS07_N704_S506", "DS07_N714_S506", "DS07_N715_S505", "DSJN001_N724_S503", "DSJN002_N703_S507", "DSJN002_N702_S502", "DSJN002_N705_S508", "DSJN002_N702_S510", "DSJN002_N711_S502", "DSJN003_N701_S507", "DSJN002_N706_S511", "DSJN002_N715_S502", "DSJN002_N712_S502", "DSJN003_N705_S506", "DSJN003_N704_S507", "DSJN002_N724_S515", "DSJN002_N722_S513", "DSJN002_N720_S521", "DSJN002_N719_S517", "DSJN002_N718_S521", "DSJN002_N728_S513", "DSJN003_N718_S513", "DSJN002_N728_S516", "DS08_N719_S503", "DS08_N723_S507")
# exclude glia and bulk samples
neurons <- cortical[,which(!(colnames(cortical) %in% glia0 | colData(cortical)$MD_single_bulk_pool == "B"))]
# removing ERCC spike ins
neurons <- neurons[grep("^ERCC-", rownames(neurons), invert = TRUE),]
# removing low quality samples
sf <- metric_sample_filter(assay(neurons), nreads=colData(neurons)$NREADS, ralign=colData(neurons)$RALIGN, zcut=2)
keep <- which(!(sf$filtered_nreads | sf$filtered_ralign))
# plot(colData(neurons)$NREADS, colData(neurons)$RALIGN)
# points(colData(neurons)$NREADS[which((sf$filtered_nreads | sf$filtered_ralign))], colData(neurons)$RALIGN[which((sf$filtered_nreads | sf$filtered_ralign))], col=2, pch=20)
filtered <- neurons[, keep]
filter <- apply(assay(filtered)>10, 1, sum)>=10
Number of retained genes:
print(sum(filter))
## [1] 8721
To speed up the computations, I will focus on the top 1,000 most variable genes.
filtered <- filtered[filter,]
core <- assay(filtered)
vars <- rowVars(log1p(core))
names(vars) <- rownames(core)
vars <- sort(vars, decreasing = TRUE)
core <- core[names(vars)[1:1000],]
First, let’s look at PCA (of the log counts) for reference.
cols <- brewer.pal(9, "Set1")
colMerged <- cols[colData(filtered)$MD_cell_type]
detection_rate <- colSums(core>0)
coverage <- colSums(core)
pca <- prcomp(t(log1p(core)))
plot(pca$x, col=colMerged, pch=19, main="PCA of log-counts, centered not scaled")
df <- data.frame(PC1=pca$x[,1], PC2=pca$x[,2], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=colMerged)
print(cor(df, method="spearman"))
## PC1 PC2 detection_rate coverage
## PC1 1.00000000 -0.054470256 0.84600973 0.922858873
## PC2 -0.05447026 1.000000000 0.02734315 0.002126654
## detection_rate 0.84600973 0.027343152 1.00000000 0.625824488
## coverage 0.92285887 0.002126654 0.62582449 1.000000000
Now, let’s see how ZINB works.
print(system.time(zinb_Vall <- zinbFit(core, ncores = 3, K = 2)))
## user system elapsed
## 227.633 59.520 127.503
plot(zinb_Vall@W, col=colMerged, pch=19, xlab="W1", ylab="W2", main="ZINB")
#total number of detected genes in the cell
df <- data.frame(W1=zinb_Vall@W[,1], W2=zinb_Vall@W[,2], gamma_mu = zinb_Vall@gamma_mu[1,], gamma_pi = zinb_Vall@gamma_pi[1,], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=colMerged)
print(cor(df, method="spearman"))
## W1 W2 gamma_mu gamma_pi
## W1 1.0000000000 -0.0004612369 0.70920538 -0.6747868
## W2 -0.0004612369 1.0000000000 0.08442253 0.1068222
## gamma_mu 0.7092053768 0.0844225267 1.00000000 -0.4496961
## gamma_pi -0.6747868376 0.1068222474 -0.44969613 1.0000000
## detection_rate 0.7191271943 -0.0746530992 0.55727837 -0.9848206
## coverage 0.7223596417 0.0477250495 0.98785251 -0.5252069
## detection_rate coverage
## W1 0.7191272 0.72235964
## W2 -0.0746531 0.04772505
## gamma_mu 0.5572784 0.98785251
## gamma_pi -0.9848206 -0.52520693
## detection_rate 1.0000000 0.62582449
## coverage 0.6258245 1.00000000
batch <- droplevels(colData(filtered)$MD_c1_run_id)
bio <- droplevels(colData(filtered)$MD_cell_type)
mod <- model.matrix(~batch)
colBatch <- tapply(colMerged, batch, function(x) x[1])
boxplot(df$W1~batch, col=colBatch)
boxplot(df$W2~batch, col=colBatch)
print(system.time(zinb_X <- zinbFit(core, ncores = 3, K = 2, X=mod)))
## user system elapsed
## 807.717 346.243 435.708
plot(zinb_X@W, col=colMerged, pch=19, xlab="W1", ylab="W2", main="ZINB")
Not surprisingly, removing the batch effects also removes the biology, because of confounded design.
#total number of detected genes in the cell
df <- data.frame(W1=zinb_X@W[,1], W2=zinb_X@W[,2], gamma_mu = zinb_X@gamma_mu[1,], gamma_pi = zinb_X@gamma_pi[1,], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=colMerged)
print(cor(df, method="spearman"))
## W1 W2 gamma_mu gamma_pi
## W1 1.00000000 -0.06642216 0.5257920 -0.7578291
## W2 -0.06642216 1.00000000 -0.5618436 0.3444663
## gamma_mu 0.52579195 -0.56184363 1.0000000 -0.5913995
## gamma_pi -0.75782915 0.34446628 -0.5913995 1.0000000
## detection_rate 0.75461410 -0.36227768 0.6667068 -0.9768876
## coverage 0.47169513 -0.58863094 0.9836926 -0.5541336
## detection_rate coverage
## W1 0.7546141 0.4716951
## W2 -0.3622777 -0.5886309
## gamma_mu 0.6667068 0.9836926
## gamma_pi -0.9768876 -0.5541336
## detection_rate 1.0000000 0.6258245
## coverage 0.6258245 1.0000000
boxplot(df$W1~batch, col=colBatch)
boxplot(df$W2~batch, col=colBatch)
qc <- as.matrix(colData(filtered)[,1:14])
qcpca <- prcomp(qc, center=TRUE, scale.=TRUE)
mod <- model.matrix(~qcpca$x[,1:5])
plot(df$W1~qcpca$x[,1], pch=19, col=colMerged)
plot(df$W2~qcpca$x[,1], pch=19, col=colMerged)
print(system.time(zinb_q <- zinbFit(core, ncores = 3, K = 2, X=mod)))
## user system elapsed
## 376.159 81.445 175.908
plot(zinb_q@W, col=colMerged, pch=19, xlab="W1", ylab="W2", main="ZINB")
#total number of detected genes in the cell
df <- data.frame(W1=zinb_q@W[,1], W2=zinb_q@W[,2], gamma_mu = zinb_q@gamma_mu[1,], gamma_pi = zinb_q@gamma_pi[1,], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=colMerged)
print(cor(df, method="spearman"))
## W1 W2 gamma_mu gamma_pi
## W1 1.000000000 0.05303947 -0.08038000 0.1263505
## W2 0.053039473 1.00000000 -0.04453885 -0.1485065
## gamma_mu -0.080379998 -0.04453885 1.00000000 -0.5122902
## gamma_pi 0.126350534 -0.14850652 -0.51229019 1.0000000
## detection_rate -0.131804715 0.15596995 0.60868408 -0.9848994
## coverage -0.008229641 -0.11917482 0.98324333 -0.5394186
## detection_rate coverage
## W1 -0.1318047 -0.008229641
## W2 0.1559700 -0.119174817
## gamma_mu 0.6086841 0.983243329
## gamma_pi -0.9848994 -0.539418610
## detection_rate 1.0000000 0.625824488
## coverage 0.6258245 1.000000000
boxplot(df$W1~batch, col=colBatch)
boxplot(df$W2~batch, col=colBatch)
Better, but W may still be influenced by batch effects.
print(system.time(zinb_3 <- zinbFit(core, ncores = 3, K = 3, X=mod)))
## user system elapsed
## 414.668 84.572 192.097
pairs(zinb_3@W, col=colMerged, pch=19, main="ZINB")
#total number of detected genes in the cell
df <- data.frame(W1=zinb_3@W[,1], W2=zinb_3@W[,2], W3=zinb_3@W[,3], gamma_mu = zinb_3@gamma_mu[1,], gamma_pi = zinb_3@gamma_pi[1,], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=colMerged)
print(cor(df, method="spearman"))
## W1 W2 W3 gamma_mu gamma_pi
## W1 1.00000000 -0.01732276 0.01513701 -0.01169311 0.41084352
## W2 -0.01732276 1.00000000 0.03032168 -0.06768404 0.03405243
## W3 0.01513701 0.03032168 1.00000000 -0.01524494 0.13486914
## gamma_mu -0.01169311 -0.06768404 -0.01524494 1.00000000 -0.56670097
## gamma_pi 0.41084352 0.03405243 0.13486914 -0.56670097 1.00000000
## detection_rate -0.41145339 -0.03753188 -0.11795051 0.65777702 -0.98460392
## coverage 0.09691373 -0.02171400 -0.05042026 0.98158891 -0.54548684
## detection_rate coverage
## W1 -0.41145339 0.09691373
## W2 -0.03753188 -0.02171400
## W3 -0.11795051 -0.05042026
## gamma_mu 0.65777702 0.98158891
## gamma_pi -0.98460392 -0.54548684
## detection_rate 1.00000000 0.62582449
## coverage 0.62582449 1.00000000
boxplot(df$W1~batch, col=colBatch)
boxplot(df$W2~batch, col=colBatch)
boxplot(df$W3~batch, col=colBatch)
# mod <- make_design(bio, batch, W=NULL, nested=TRUE)
mod <- model.matrix(~bio)
print(system.time(zinb_bio <- zinbFit(core, ncores = 3, K = 2, X=mod)))
## user system elapsed
## 276.304 70.685 135.794
plot(zinb_bio@W, col=colMerged, pch=19, xlab="W1", ylab="W2", main="ZINB")
quality <- qcpca$x[,1]
data.frame(W1=zinb_bio@W[,1], W2=zinb_bio@W[,2]) %>% ggplot(aes(W1, W2)) + geom_point(aes(color=quality), size=3) + scale_colour_gradient(low="blue", high="green") + theme_classic()
print(system.time(zinb_noW <- zinbFit(core, ncores = 3, K = 0, X=mod)))
## user system elapsed
## 188.594 53.394 103.411
par(mfrow=c(1, 2))
data(cortical_markers)
L4markers <- cortical_markers[cortical_markers[,2] %in% c("L4", "L2/3,L4") ,1]
L5markers <- cortical_markers[cortical_markers[,2] %in% c("L5", "L5b"), 1]
L6markers <- cortical_markers[cortical_markers[,2] %in% c("L6"), 1]
meanL4 <- zinb_bio@beta_mu[1,]
meanL5 <- zinb_bio@beta_mu[1,] + zinb_bio@beta_mu[2,]
meanL6 <- zinb_bio@beta_mu[1,] + zinb_bio@beta_mu[3,]
logFC <- zinb_bio@beta_mu[2,]
mean <- (meanL4+meanL5)/2
names(mean) <- names(logFC) <- rownames(core)
plot(mean, logFC, main="Layer 5 vs Layer 4")
points(mean[L4markers], logFC[L4markers], col=2, pch=20)
points(mean[L5markers], logFC[L5markers], col=4, pch=20)
legend("topright", c("L4 markers", "L5 markers"), pch=20, col=c(2, 4))
logFC <- zinb_bio@beta_mu[3,]
mean <- (meanL4+meanL6)/2
names(mean) <- names(logFC) <- rownames(core)
plot(mean, logFC, main="Layer 6 vs Layer 4")
points(mean[L4markers], logFC[L4markers], col=2, pch=20)
points(mean[L6markers], logFC[L6markers], col=4, pch=20)
legend("topright", c("L4 markers", "L6 markers"), pch=20, col=c(2, 4))
par(mfrow=c(1, 2))
meanL4 <- zinb_noW@beta_mu[1,]
meanL5 <- zinb_noW@beta_mu[1,] + zinb_noW@beta_mu[2,]
meanL6 <- zinb_noW@beta_mu[1,] + zinb_noW@beta_mu[3,]
logFC <- zinb_noW@beta_mu[2,]
mean <- (meanL4+meanL5)/2
names(mean) <- names(logFC) <- rownames(core)
plot(mean, logFC, main="Layer 5 vs Layer 4")
points(mean[L4markers], logFC[L4markers], col=2, pch=20)
points(mean[L5markers], logFC[L5markers], col=4, pch=20)
legend("topright", c("L4 markers", "L5 markers"), pch=20, col=c(2, 4))
logFC <- zinb_noW@beta_mu[3,]
mean <- (meanL4+meanL6)/2
names(mean) <- names(logFC) <- rownames(core)
plot(mean, logFC, main="Layer 6 vs Layer 4")
points(mean[L4markers], logFC[L4markers], col=2, pch=20)
points(mean[L6markers], logFC[L6markers], col=4, pch=20)
legend("topright", c("L4 markers", "L6 markers"), pch=20, col=c(2, 4))